Reanalysis of "No evidence for correlations between handgrip strength and sexually dimorphic acoustic properties of voices".
Data and R code are available at https://osf.io/na2sb/
import os
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt
%pylab inline
%config InlineBackend.figure_format = 'retina'
%load_ext watermark
plt.style.use('ggplot')
%watermark -v -m -p numpy,pandas,seaborn,pymc3,matplotlib
I re-saved the data in CSV format.
# Load the re-saved CSV of the original dataset.  The delimiter is passed
# as a keyword: the positional `sep` form (`pd.read_csv(path, ',')`) is
# deprecated in recent pandas releases.
tbl = pd.read_csv('final_dataset.csv', sep=',')
tbl.head()
The main variables are handgrip strength (HGS) and the acoustic properties of voices.
# Overall descriptive statistics for the key variables, then the sample
# size in each ethnicity-by-sex cell.
key_vars = ['age', 'HGS', 'F0', 'F0SD', 'Df_Puts', 'Pf_Puts', 'VTL_Reby']
tbl[key_vars].agg(['count', 'mean', 'std']).T.round(3)
tbl.groupby(['ethnicity', 'sex'])['age'].agg(['count']).T.round(3)
Descriptive data summary and display
def btCI(x, n_boot=1000):
    """Bootstrap 95% confidence interval of the mean of `x`.

    Resamples `x` with replacement `n_boot` times, computes the mean of
    each resample, and returns the (2.5th, 97.5th) percentiles of those
    bootstrap means as a length-2 array.

    Replaces the previous use of seaborn's private helpers
    (`sns.utils.ci` / `sns.algorithms.bootstrap`), which are not part of
    seaborn's public API and were removed in later releases.  The default
    `n_boot=1000` matches the original call.
    """
    x = np.asarray(x)
    # (n_boot, len(x)) index matrix -> one bootstrap mean per row.
    idx = np.random.randint(0, len(x), size=(n_boot, len(x)))
    boot_means = x[idx].mean(axis=1)
    return np.percentile(boot_means, [2.5, 97.5])

def btCI_025(x):
    """Lower (2.5%) bound of the bootstrap CI of the mean of `x`."""
    return btCI(x)[0]

def btCI_975(x):
    """Upper (97.5%) bound of the bootstrap CI of the mean of `x`."""
    return btCI(x)[1]
# Per-subgroup summaries (mean, bootstrap CI, SD), then a pairwise
# scatter-plot matrix coloured by ethnicity-sex subgroup.
summary_cols = ['age', 'HGS', 'F0', 'F0SD', 'Df_Puts', 'Pf_Puts', 'VTL_Reby']
grouped = tbl.groupby(['ethnicity', 'sex'])[summary_cols]
grouped.agg(['mean', btCI_025, btCI_975, 'std']).T.round(3)

tbl['subgroup'] = tbl['ethnicity'] + '-' + tbl['sex']
sns.pairplot(tbl, hue='subgroup',
             vars=summary_cols[1:],   # all measures except age
             diag_kind='kde',
             plot_kws=dict(alpha=.25));
The linear regressions reported in the manuscript do not account for the repeated measurements of acoustic properties from each subject. Treating the five acoustic measures as the predictor does not properly quantify noise associated with the measurement. To address this issue, we performed a Bayesian analysis with a multivariate latent model. Acoustics and handgrip strength within each participant are modeled as a realization of a latent multivariate normal distribution, with the correlation matrix capturing the relationship between acoustics and handgrip strength. The Bayesian analysis is performed in Python using PyMC3 version 3.2, and the results were displayed using Seaborn and Matplotlib.
# Build the fixed-effects design matrix (intercept, ethnicity, sex and
# their interaction) and the raw subjects-by-measures data matrix.
from patsy import dmatrices

_, design = dmatrices("HGS ~ ethnicity*sex", data=tbl, return_type='matrix')
Terms = design.design_info.column_names
X = np.asarray(design)              # design matrix (nsbj x npredi)
measure_cols = ['HGS', 'F0', 'F0SD', 'Df_Puts', 'Pf_Puts', 'VTL_Reby']
datamat = tbl[measure_cols].values  # data (nsbj x Nmeasure)
nsbj, npredi = X.shape
Nmeasure = datamat.shape[1]
Terms
$$ \begin{align*} R & \sim \text{LKJCorr}(2)\\ \sigma & \sim \text{HalfNormal}(50)\\ \text{Prior for coefficients}\\ \beta & \sim \text{Normal}(0, 100)\\ \Sigma & = diag(\sigma)*R*diag(\sigma)\\ \mu_{latent} & \sim \text{MvNormal}(X\beta, \Sigma)\\ observed & \sim \text{Normal}(\mu_{latent}, \sigma) \end{align*} $$
The latent measure follows a prior distribution defined by the six-dimensional Gaussian distribution with means $X\beta$ and covariance matrix $\Sigma$. In practice, we parameterized the Cholesky decomposition of the covariance matrix. We then performed an affine transformation by multiplying the Cholesky factor of $\Sigma$ with a standard Normal and adding $X\beta$.
# Latent multivariate-normal model: each subject's six measures arise from
# a latent MvNormal with group-specific means X*beta and a shared covariance.
with pm.Model() as mv_model1:
    # prior for covariance: HalfNormal(50) on each sd, LKJ(eta=2) on the
    # correlation matrix, parameterized via the packed Cholesky factor.
    sd_dist = pm.HalfNormal.dist(50)
    packed_chol = pm.LKJCholeskyCov('chol_cov', n=Nmeasure, eta=2, sd_dist=sd_dist)
    # compute the covariance and correlation matrix
    chol = pm.expand_packed_triangular(Nmeasure, packed_chol, lower=True)
    cov = pm.Deterministic('cov', tt.dot(chol, chol.T))
    # Extract the standard deviations and rho
    sd = pm.Deterministic('sd', tt.sqrt(tt.diag(cov)))
    corr = pm.Deterministic('corr', tt.diag(sd**-1).dot(cov.dot(tt.diag(sd**-1))))
    # Regression coefficients of the group means (npredi x Nmeasure).
    beta = pm.Normal('beta', mu=0., sd=100., shape=(npredi, Nmeasure))
    # Non-centered parameterization: latent means = X*beta + chol @ z,
    # with z standard normal (one row per subject).
    mu_ = pm.Normal('mu_', mu=0., sd=1., shape=datamat.shape)
    mu_group = tt.dot(X, beta) + tt.dot(chol, mu_.T).T
    # observed
    # NOTE(review): the observation noise reuses the latent sd; the
    # commented-out sd_ below would decouple them — presumably intentional.
    # sd_ = pm.HalfNormal('sd_', 10, shape=Nmeasure)
    obs = pm.Normal('observed', mu=mu_group, sd=sd, observed=datamat)
    # 4 chains, 2000 tuning steps discarded, 1000 kept draws per chain.
    trace1 = pm.sample(1000, tune=2000, njobs=4,
                       # nuts_kwargs=dict(target_accept=.95, max_treedepth=15)
                       )
The probabilistic model was built using PyMC3 and we sampled from the posterior distribution using the No-U-Turn Sampler (NUTS). We ran four MCMC chains with 3000 samples each. The first 2000 samples were used for tuning the mass matrix and step size for NUTS and were discarded afterwards. Model convergence was diagnosed by computing Gelman and Rubin's convergence diagnostic (R-hat, 1992), examining the effective sample size, inspecting the mixing of the traces, and checking whether any divergent samples were returned from the sampler.
# Trace plots for a first convergence check.
pm.traceplot(trace1, varnames=['beta', 'chol_cov', 'sd']);

measure_labels = ['HGS', 'F0', 'F0SD', 'Df_Puts', 'Pf_Puts', 'VTL_Reby']

# Subtract each subgroup's mean so the empirical correlations reflect
# pooled within-group covariation, comparable to the latent model.
data_demean = np.zeros_like(datamat)
for grp in tbl['subgroup'].unique():
    mask = (tbl['subgroup'] == grp).values
    data_demean[mask, :] = datamat[mask, :] - datamat[mask, :].mean(axis=0)

empirical_corr = pd.DataFrame(np.corrcoef(data_demean.T),
                              index=measure_labels,
                              columns=measure_labels)
# Side-by-side heatmaps: empirical vs. posterior-mean latent correlations.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
heat_kw = dict(cbar=False, square=True, annot=True,
               linewidths=.1, cmap='viridis')
sns.heatmap(empirical_corr, ax=axes[0], **heat_kw)
axes[0].set_title('Empirical Correlation Matrix')
posterior_corr = pd.DataFrame(trace1['corr'].mean(axis=0),
                              index=measure_labels,
                              columns=measure_labels)
sns.heatmap(posterior_corr, ax=axes[1], **heat_kw)
axes[1].set_title('Latent Correlation Matrix')
plt.tight_layout();
A conclusion similar to the paper's can be drawn: the posterior distributions of the correlations between handgrip strength and the acoustic measures overlap with zero.
# Posterior of Corr(HGS, each acoustic measure); the empirical value is
# marked with a red vertical line.
plotpost = pm.plots.artists.plot_posterior_op
fig, axes = plt.subplots(2, 3, figsize=(18, 6), sharex=True)
corrtrace = trace1['corr'][:, 0, :]
ax1 = axes.flatten()
for i, measure in enumerate(measure_labels[1:], start=1):
    panel = ax1[i - 1]
    plotpost(corrtrace[:, i], panel, kde_plot=False, point_estimate='mean',
             round_to=3, alpha_level=0.05, ref_val=0., rope=None,
             color='#87ceeb')
    panel.axvline(empirical_corr.values[0, i], color='r')
    panel.set_title('Corr(' + measure_labels[0] + ', ' + measure + ')')
plt.tight_layout();
# Posterior predictive check: 100 simulated data sets (black) overlaid
# on the observed values (coloured) for each measure.
ppc1 = pm.sample_ppc(trace1, model=mv_model1, samples=100)
fig, axes = plt.subplots(Nmeasure, 1, figsize=(18, 18), sharex=True)
for i in range(Nmeasure):
    obs_i = datamat[:, i]
    axes[i].plot(ppc1['observed'][:, :, i].T, 'o', color='k', alpha=.025)
    axes[i].plot(obs_i.T, 'o')
    axes[i].set_title(measure_labels[i])
    axes[i].set_ylim(obs_i.min() - np.std(obs_i),
                     obs_i.max() + np.std(obs_i))
plt.tight_layout();
def CONVERGENCE_TITLE(a=None, b=None, c=None):
    """Format a convergence-summary title for the energy plots.

    Parameters default to the module-level diagnostics ``bfmi``, ``max_gr``
    and ``min_effn`` (set just before each call via ``get_diags``), so the
    existing zero-argument call sites keep working; values may also be
    passed explicitly.  Replaces a lambda assigned to a name, which PEP 8
    discourages and which hid the dependence on those three globals.
    """
    a = bfmi if a is None else a
    b = max_gr if b is None else b
    c = min_effn if c is None else c
    return 'BFMI = {a:.2f}\nmax(R_hat) = {b:.3f}\nmin(Eff_n) = {c:.3f}'.format(
        a=a, b=b, c=c)
def get_diags(trace):
    """Return (BFMI, worst R-hat, smallest median effective-n) for a trace."""
    energy_bfmi = pm.bfmi(trace)
    worst_rhat = max(np.max(v) for v in pm.gelman_rubin(trace).values())
    smallest_effn = min(np.median(v) for v in pm.effective_n(trace).values())
    return energy_bfmi, worst_rhat, smallest_effn
# Energy plot with convergence diagnostics in the title (model 1).
fig, axis = plt.subplots(1, 1, figsize=(7, 5))
bfmi, max_gr, min_effn = get_diags(trace1)
energy_axis = pm.energyplot(trace1, ax=axis)
energy_axis.set_title(CONVERGENCE_TITLE());
# Bootstrap the within-group correlation matrix for comparison with the
# posterior.  The buffer is sized by nboot explicitly: the previous
# `np.zeros_like(trace1['corr'])` only worked because the MCMC trace
# happened to hold exactly 4000 draws (4 chains x 1000); any other
# chain/draw count would leave unfilled rows or raise an IndexError.
nboot = 4000
corr_boot = np.zeros((nboot, Nmeasure, Nmeasure))
for ib in range(nboot):
    boot_idx = np.random.choice(nsbj, nsbj)  # resample subjects with replacement
    corr_boot[ib, :, :] = np.corrcoef(data_demean[boot_idx, :].T)
# Overlay posterior and bootstrap distributions of Corr(HGS, measure).
fig, axes = plt.subplots(2, 3, figsize=(18, 6), sharex=True)
ax1 = axes.flatten()
for i, measure in enumerate(measure_labels[1:], start=1):
    panel = ax1[i - 1]
    sns.distplot(trace1['corr'][:, 0, i], kde=True, ax=panel,
                 label='Posterior Distribution')
    sns.distplot(corr_boot[:, 0, i], kde=True, ax=panel,
                 label='Bootstrap Distribution')
    panel.set_title('Corr(' + measure_labels[0] + ', ' + measure + ')')
    panel.axvline(0, color='k')
    panel.legend()
plt.tight_layout();
$$ \begin{align*} R & \sim \text{LKJCorr}(1)\\ \sigma & \sim \text{HalfNormal}(50)\\ \text{Prior for group and measurement}\\ \mu & \sim \text{Normal}(\overline{measure}, 100)\\ \Sigma & = diag(\sigma)*R*diag(\sigma)\\ observed & \sim \text{MvNormal}(\mu, \Sigma) \end{align*} $$
Alternatively, we treat the observed measurements as is and model them directly using a multivariate normal distribution.
# Integer-code each grouping column in order of first appearance.
# pd.factorize is the idiomatic (and faster) replacement for the previous
# hand-rolled unique() -> replace() mapping; it produces identical codes
# because both enumerate categories in order of first appearance.
X1 = {c: pd.factorize(tbl[c])[0]
      for c in ['subgroup', 'ethnicity', 'sex']}
Ng = len(np.unique(X1['subgroup']))  # number of ethnicity-sex subgroups
X1['subgroup']
# Direct model: the observed measures follow a MvNormal whose mean depends
# on the ethnicity-sex subgroup, with a shared covariance across subgroups.
with pm.Model() as mv_model:
    # prior for covariance: HalfNormal(50) on each sd, LKJ(eta=1, i.e.
    # uniform over correlation matrices), via the packed Cholesky factor.
    sd_dist = pm.HalfNormal.dist(50)
    packed_chol = pm.LKJCholeskyCov('chol_cov', n=Nmeasure, eta=1, sd_dist=sd_dist)
    # compute the covariance and correlation matrix
    chol = pm.expand_packed_triangular(Nmeasure, packed_chol, lower=True)
    cov = pm.Deterministic('cov', tt.dot(chol, chol.T))
    # Extract the standard deviations and rho
    sd = pm.Deterministic('sd', tt.sqrt(tt.diag(cov)))
    corr = pm.Deterministic('corr', tt.diag(sd**-1).dot(cov.dot(tt.diag(sd**-1))))
    # One mean vector per subgroup (Ng x Nmeasure), centered on the
    # empirical grand means of the measures.
    beta = pm.Normal('beta', mu=datamat.mean(axis=0), sd=100., shape=(Ng, Nmeasure))
    # observed: each row's mean indexed by its subgroup code.
    obs = pm.MvNormal('observed',
                      mu=beta[X1['subgroup'], :],
                      chol=chol,
                      observed=datamat)
    # 4 chains, 2000 tuning steps discarded, 1000 kept draws per chain.
    trace = pm.sample(1000, tune=2000, njobs=4
                      # nuts_kwargs=dict(target_accept=.95, max_treedepth=15)
                      )
# Trace plots, then empirical vs. posterior correlation heatmaps (model 2).
pm.traceplot(trace, varnames=['beta', 'chol_cov', 'sd']);

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
heatmap_kwargs = dict(cbar=False, square=True, annot=True,
                      linewidths=.1, cmap='viridis')
sns.heatmap(empirical_corr, ax=axes[0], **heatmap_kwargs)
axes[0].set_title('Empirical Correlation Matrix')
posterior_corr = pd.DataFrame(trace['corr'].mean(axis=0),
                              index=measure_labels,
                              columns=measure_labels)
sns.heatmap(posterior_corr, ax=axes[1], **heatmap_kwargs)
axes[1].set_title('Latent Correlation Matrix')
plt.tight_layout();
# Posterior of Corr(HGS, measure) under model 2; empirical value in red.
plotpost = pm.plots.artists.plot_posterior_op
fig, axes = plt.subplots(2, 3, figsize=(18, 6), sharex=True)
corrtrace = trace['corr'][:, 0, :]
ax1 = axes.flatten()
for i, measure in enumerate(measure_labels[1:], start=1):
    panel = ax1[i - 1]
    plotpost(corrtrace[:, i], panel, kde_plot=False, point_estimate='mean',
             round_to=3, alpha_level=0.05, ref_val=0., rope=None,
             color='#87ceeb')
    panel.axvline(empirical_corr.values[0, i], color='r')
    panel.set_title('Corr(' + measure_labels[0] + ', ' + measure + ')')
plt.tight_layout();
# Energy plot with convergence diagnostics in the title (model 2).
fig, axis = plt.subplots(1, 1, figsize=(7, 5))
bfmi, max_gr, min_effn = get_diags(trace)
pm.energyplot(trace, ax=axis).set_title(CONVERGENCE_TITLE());
# Posterior vs. bootstrap distributions of Corr(HGS, measure), model 2.
fig, axes = plt.subplots(2, 3, figsize=(18, 6), sharex=True)
ax1 = axes.flatten()
for i, measure in enumerate(measure_labels[1:], start=1):
    panel = ax1[i - 1]
    sns.distplot(trace['corr'][:, 0, i], kde=True, ax=panel,
                 label='Posterior Distribution')
    sns.distplot(corr_boot[:, 0, i], kde=True, ax=panel,
                 label='Bootstrap Distribution')
    panel.set_title('Corr(' + measure_labels[0] + ', ' + measure + ')')
    panel.axvline(0, color='k')
    panel.legend()
plt.tight_layout();
As shown above, we can draw the same conclusion: there is no correlation between handgrip strength and the acoustic measures.